/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

/*!
 * \file kernel_operator_common_impl.h
 * \brief AscendC l310 support common api.
 */
#ifndef ASCENDC_MODULE_OPERATOR_COMMON_IMPL_H
#define ASCENDC_MODULE_OPERATOR_COMMON_IMPL_H
#include "kernel_struct_mm.h"

#define GetLoopBoundB8(x) (((x) - 1) / VECTOR_REG_WIDTH)
#define GetLoopBoundB16(x) (((x) - 1) / (VECTOR_REG_WIDTH / B16_BYTE_SIZE))
#define GetLoopBoundB32(x) (((x) - 1) / (VECTOR_REG_WIDTH / B32_BYTE_SIZE))

namespace AscendC {
__aicore__ inline int64_t GetSubBlockIdxImpl()
{
    return 0;
}

__aicore__ inline int64_t GetTaskRationImpl()
{
    return 1;
}

__aicore__ inline int64_t GetBlockIdxImpl()
{
    return block_idx;
}

__aicore__ inline void SetSysWorkspace(GM_ADDR workspace)
{
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
    ASCENDC_ASSERT((workspace != nullptr), { KERNEL_LOG(KERNEL_ERROR, "workspace can not be nullptr");});
#else
    if (g_sysWorkspaceReserved == nullptr) {
        g_sysWorkspaceReserved = workspace;
    }
#endif
}

__aicore__ inline void SetSysWorkspaceForce(GM_ADDR workspace)
{
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
    ASCENDC_ASSERT((workspace != nullptr), { KERNEL_LOG(KERNEL_ERROR, "workspace can not be nullptr");});
#else
    g_sysWorkspaceReserved = workspace;
#endif
}

__aicore__ inline GM_ADDR GetUserWorkspace(GM_ADDR workspace)
{
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
    ASCENDC_ASSERT((workspace != nullptr), { KERNEL_LOG(KERNEL_ERROR, "workspace can not be nullptr");});
    return workspace;
#else
    (void)(workspace);
    // reserved 0 Bytes
    return g_sysWorkspaceReserved;
#endif
}

template<bool isAIVOnly = true>
__aicore__ inline void SoftSyncAllImpl(__gm__ int32_t* gmWorkspaceAddr, __ubuf__ int32_t* ubWorkspaceAddr,
    const int usedCores)
{
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "unsupported software syncAll!");});
}

template<bool isAIVOnly = true>
__aicore__ inline void SyncAllImpl()
{
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "unsupported hardware syncAll!");});
}

template <pipe_t pipe>
__aicore__ inline void NotifyEventImpl(uint16_t flagId)
{
    ASCENDC_ASSERT((false), "NotifyEvent is not supported!");
}

template <uint8_t modeId, pipe_t pipe>
__aicore__ inline void WaitEventImpl(uint16_t flagId)
{
    ASCENDC_ASSERT((false), "WaitEvent is not supported!");
}

template <atomic_type_t type, atomic_op_t op>
__aicore__ inline void SetStoreAtomicConfigImpl()
{
    ASCENDC_ASSERT((false), "SetStoreAtomicConfig is not supported!");
}

__aicore__ inline int64_t GetStoreAtomicConfigImpl()
{
    ASCENDC_ASSERT((false), "GetStoreAtomicConfig is not supported!");
    return 0;
}

__aicore__ inline void GetStoreAtomicConfigImpl(uint16_t &atomicType, uint16_t &atomicOp)
{
    ASCENDC_ASSERT((false), "GetStoreAtomicConfig is not supported!");
}

__aicore__ inline void SetSyncBaseAddrImpl(uint64_t config)
{
    ASCENDC_ASSERT((false), "SetSyncBaseAddr is not supported!");
}

template <typename T>
__aicore__ inline void DataCachePreloadImpl(const GlobalTensor<uint64_t> &src, const T cacheOffset)
{
    ASCENDC_ASSERT((false), "DataCachePreload is not supported!");
}

__aicore__ inline int64_t GetICachePreloadStatusImpl()
{
    ASCENDC_ASSERT((false), "GetICachePreloadStatus is not supported!");
    return 0;
}

__aicore__ inline void PreLoad(const int64_t preFetchLen)
{
    ASCENDC_ASSERT((false), "ICachePreLoad is not supported!");
}

__aicore__ inline void CheckLocalMemoryIAImpl(const CheckLocalMemoryIAParam& checkParams)
{
    ASCENDC_ASSERT((false), "enableBit is not supported!");
}

template <typename T> struct TypeGet;
template <> struct TypeGet<uint32_t> {
    using T = vector_u32;
};
template <> struct TypeGet<int32_t> {
    using T = vector_s32;
};
template <> struct TypeGet<float> {
    using T = vector_f32;
};
template <> struct TypeGet<uint16_t> {
    using T = vector_u16;
};
template <> struct TypeGet<half> {
    using T = vector_f16;
};
template <> struct TypeGet<int16_t> {
    using T = vector_s16;
};
template <> struct TypeGet<uint8_t> {
    using T = vector_u8;
};
template <> struct TypeGet<int8_t> {
    using T = vector_s8;
};
 
using MaskReg = vector_bool;
using UnalignReg = vector_align;
using AddrReg = vector_address;
 
#define LocalMem __ubuf__
 
template <typename T> struct RegTensor {
    __aicore__ inline RegTensor(){};
    using RegType = typename TypeGet<T>::T;
    RegType reg;
 
    __aicore__ inline operator RegType &()
    {
        return reg;
    }
    __aicore__ void Print() const;
};
 
template <class T> __aicore__ inline void LocalMemBar(T mem_type);
 
enum class PPMode {
    UNKNOWN = -1,
    ZERO,
    ONE,
    TWO,
    THREE
};
 
enum class PartMode {
    UNKNOWN = -1,
    EVEN,
    ODD
};
 
enum class SatMode {
    UNKNOWN = -1,
    NO_SAT,
    SAT
};
 
enum class Rnd {
    NO_RND_VALUE,
    RND_VALUE
};
 
enum class StoreMode {
    NOSTORED,
    STORED
};
 
enum class BinLiteral {
    BIN_0,
    BIN_1,
    BIN_2,
    BIN_3,
};
 
enum class Order {
    INC_ORDER_VALUE,
    DEC_ORDER_VALUE
};
 
enum class Mode {
    UNKNOWN,
    MERGING,
    ZEROING
};
 
enum class Pos {
    LOWEST,
    HIGHEST
};
enum class HiloPart {
    Lower,
    Higher
};
enum class PostLiteral {
    POST_MODE_NORAML,
    POST_MODE_UPDATE
};
enum class Pat {
    ALL,
    VL1,
    VL2,
    VL3,
    VL4,
    VL8,
    VL16,
    VL32,
    VL64,
    VL128,
    M3,
    M4,
    H,
    Q,
    ALLF = 15
};
enum class Dist {
    DIST_NORM, // vld, pld, pst
    DIST_BRC_B8,
    DIST_BRC_B16,
    DIST_BRC_B32,
    DIST_US_B8 = 6,
    DIST_US_B16,
    DIST_DS_B8,
    DIST_DS_B16,
    DIST_BDINTLV,
    DIST_DINTLV_B8,
    DIST_DINTLV_B16,
    DIST_UNPK_B8,
    DIST_UNPK_B16,
    DIST_BLK,
    DIST_E2B_B16,
    DIST_E2B_B32,
    DIST_UNPK_B32,
    DIST_DINTLV_B32,
    DIST_UNPK4_B8,
    DIST_SPLT4CHN_B8,
    DIST_SPLT2CHN_B8,
    DIST_SPLT2CHN_B16,
    DIST_US = 1, // pld
    DIST_DS = 2, // pld
    DIST_PK = 1  // pst
};
enum class DistVST {
    DIST_NORM_B8,
    DIST_NORM_B16,
    DIST_NORM_B32,
    DIST_ONEPT_B8,
    DIST_ONEPT_B16,
    DIST_ONEPT_B32,
    DIST_PK_B16,
    DIST_PK_B32,
    DIST_INTLV_B8,
    DIST_INTLV_B16,
    DIST_PK_B64,
    DIST_INTLV_B32,
    DIST_PK4_B32,
    DIST_MRG4CHN_B8,
    DIST_MRG2CHN_B8,
    DIST_MRG2CHN_B16,
    DIST_NORM
};
 
template <typename T, DistVST dist>
__aicore__ inline constexpr DistVST GetDistVst()
{
    if constexpr (dist == DistVST::DIST_NORM) {
        if constexpr (sizeof(T) == 1) {
            return DistVST::DIST_NORM_B8;
        } else if constexpr (sizeof(T) == 2) {
            return DistVST::DIST_NORM_B16;
        } else if constexpr (sizeof(T) == 4) {
            return DistVST::DIST_NORM_B32;
        }
    }
    return dist;
}
 
template <typename T> __aicore__ inline auto CreateAddrReg(uint16_t stride0)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreateAddrReg only support type b8/b16/b32");
    if constexpr (sizeof(T) == 1) {
        return vag_b8(stride0);
    } else if constexpr (sizeof(T) == 2) {
        return vag_b16(stride0);
    }
    return vag_b32(stride0);
}
 
template <typename T> __aicore__ inline auto CreateAddrReg(uint16_t stride0, uint16_t stride1)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreateAddrReg only support type b8/b16/b32");
    if constexpr (sizeof(T) == 1) {
        return vag_b8(stride0, stride1);
    } else if constexpr (sizeof(T) == 2) {
        return vag_b16(stride0, stride1);
    }
    return vag_b32(stride0, stride1);
}
 
template <typename T> __aicore__ inline auto CreateAddrReg(uint16_t stride0, uint16_t stride1, uint16_t stride2)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreateAddrReg only support type b8/b16/b32");
    if constexpr (sizeof(T) == 1) {
        return vag_b8(stride0, stride1, stride2);
    } else if constexpr (sizeof(T) == 2) {
        return vag_b16(stride0, stride1, stride2);
    }
    return vag_b32(stride0, stride1, stride2);
}
 
template <typename T>
__aicore__ inline auto CreateAddrReg(uint16_t stride0, uint16_t stride1, uint16_t stride2, uint16_t stride3)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreateAddrReg only support type b8/b16/b32");
    if constexpr (sizeof(T) == 1) {
        return vag_b8(stride0, stride1, stride2, stride3);
    } else if constexpr (sizeof(T) == 2) {
        return vag_b16(stride0, stride1, stride2, stride3);
    }
    return vag_b32(stride0, stride1, stride2, stride3);
}
 
// DataCopy
// vlds
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *srcUbAddr, int32_t offset)
{
    if constexpr (sizeof(T) == 8) {
        vlds(dstReg, srcUbAddr, offset);  // use default Dist::DIST_DINTLV_B32
    } else {
        constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
        vlds(dstReg, srcUbAddr, offset, distValue);
    }
}
 
template <typename T, PostLiteral postMode, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *&srcUbAddr, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vlds(dstReg, srcUbAddr, offset, distValue, postValue);
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, LocalMem T *srcUbAddr, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    vlds(dstReg0, dstReg1, srcUbAddr, offset, distValue);
}
 
template <typename T, PostLiteral postMode, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, LocalMem T *&srcUbAddr, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vlds(dstReg0, dstReg1, srcUbAddr, offset, distValue, postValue);
}
 
// vld
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *srcUbAddr, AddrReg offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    vld(dstReg, srcUbAddr, offset, distValue);
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *srcUbAddr)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    vld(dstReg, srcUbAddr, distValue);
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, LocalMem T *srcUbAddr, AddrReg offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    vld(dstReg0, dstReg1, srcUbAddr, offset, distValue);
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, LocalMem T *srcUbAddr)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    vld(dstReg0, dstReg1, srcUbAddr, distValue);
}
 
// vsts
template <typename T, DistVST dist = DistVST::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, RegTensor<T> &srcReg, int32_t offset, MaskReg &mask)
{
    if constexpr (sizeof(T) == 8) {
        vsts(srcReg, dstUbAddr, offset, mask);  // use default Dist::DIST_INTLV_B32
    } else {
        constexpr auto distValue = std::integral_constant<::DistVST, static_cast<::DistVST>(GetDistVst<T, dist>())>();
        vsts(srcReg, dstUbAddr, offset, distValue, mask);
    }
}
 
template <typename T, PostLiteral postMode, DistVST dist = DistVST::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *&dstUbAddr, RegTensor<T> &srcReg, int32_t offset, MaskReg &mask)
{
    constexpr auto distValue = std::integral_constant<::DistVST, static_cast<::DistVST>(GetDistVst<T, dist>())>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vsts(srcReg, dstUbAddr, offset, distValue, mask, postValue);
}
 
template <typename T, DistVST dist = DistVST::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, int32_t offset,
    MaskReg &mask)
{
    constexpr auto distValue = std::integral_constant<::DistVST, static_cast<::DistVST>(GetDistVst<T, dist>())>();
    vsts(srcReg0, srcReg1, dstUbAddr, offset, distValue, mask);
}
 
// vst
template <typename T, DistVST dist = DistVST::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, RegTensor<T> &srcReg, AddrReg offset, MaskReg &mask)
{
    constexpr auto distValue = std::integral_constant<::DistVST, static_cast<::DistVST>(GetDistVst<T, dist>())>();
    vst(srcReg, dstUbAddr, offset, distValue, mask);
}
 
template <typename T, DistVST dist = DistVST::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, AddrReg offset,
    MaskReg &mask)
{
    constexpr auto distValue = std::integral_constant<::DistVST, static_cast<::DistVST>(GetDistVst<T, dist>())>();
    vst(srcReg0, srcReg1, dstUbAddr, offset, distValue, mask);
}
 
// vsldb
template <typename T>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *srcUbAddr, uint32_t blockStride,
    uint32_t repeatStride, MaskReg &mask)
{
    vsldb(dstReg, srcUbAddr, (blockStride << 16u) | (repeatStride & 0xFFFFU), mask);
}
 
template <typename T, PostLiteral postMode>
__aicore__ inline void DataCopy(RegTensor<T> &dstReg, LocalMem T *&srcUbAddr, uint32_t blockStride,
    uint32_t repeatStride, MaskReg &mask)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vsldb(dstReg, srcUbAddr, (blockStride << 16u) | (repeatStride & 0xFFFFU), mask, postValue);
}
 
// vsstb
template <typename T>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, RegTensor<T> &srcReg, uint32_t blockStride,
    uint32_t repeatStride, MaskReg &mask)
{
    vsstb(srcReg, dstUbAddr, (blockStride << 16u) | (repeatStride & 0xFFFFU), mask);
}
 
template <typename T, PostLiteral postMode>
__aicore__ inline void DataCopy(LocalMem T *&dstUbAddr, RegTensor<T> &srcReg, uint32_t blockStride,
    uint32_t repeatStride, MaskReg &mask)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vsstb(srcReg, dstUbAddr, (blockStride << 16u) | (repeatStride & 0xFFFFU), mask, postValue);
}
 
// plds/psts
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(MaskReg &dstReg, LocalMem T *srcUbAddr, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    plds(dstReg, srcUbAddr, offset, distValue);
}
 
template <typename T, PostLiteral post, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(MaskReg &dstReg, LocalMem T *&srcUbAddr, int32_t offset)
{
    // post为plds/psts对应的入参，为0时（NO_POST_UPDATE），每次存在srcUbAddr+offset的地址；
    // 为1时，存在srcUbAddr的地址并对srcUbAddr进行+offset
    // 因为存在预编译的静态assert，这里生成的postValue均为post为1，通过判断入参来确定到底走哪个template
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(Post::POST_UPDATE_VALUE)>();
    if (post == PostLiteral::POST_MODE_UPDATE) {
        plds(dstReg, srcUbAddr, offset, distValue, postValue);
    } else {
        plds(dstReg, srcUbAddr, offset, distValue);
    }
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, MaskReg &dstReg, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    psts(dstReg, dstUbAddr, offset, distValue);
}
 
template <typename T, PostLiteral post, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *&dstUbAddr, MaskReg &dstReg, int32_t offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(Post::POST_UPDATE_VALUE)>();
    if (post == PostLiteral::POST_MODE_UPDATE) {
        psts(dstReg, dstUbAddr, offset, distValue, postValue);
    } else {
        psts(dstReg, dstUbAddr, offset, distValue);
    }
}
 
// vldas/vldus
template <typename T> __aicore__ inline void DataCopyUnAlignPre(UnalignReg &ureg, LocalMem T *srcUbAddr)
{
    vldas(ureg, srcUbAddr);
}
 
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlign(RegTensor<T> &dstReg, UnalignReg &ureg, LocalMem T *&srcUbAddr, uint32_t stride)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vldus(dstReg, ureg, srcUbAddr, stride, postValue);
}
 
template <typename T>
__aicore__ inline void DataCopyUnAlign(RegTensor<T> &dstReg, UnalignReg &ureg, LocalMem T *srcUbAddr)
{
    vldus(dstReg, ureg, srcUbAddr);
}
 
// vlda/vldu
template <typename T> __aicore__ inline void DataCopyUnAlignPre(UnalignReg &ureg, LocalMem T *srcUbAddr, AddrReg &areg)
{
    vlda(ureg, srcUbAddr, areg);
}
 
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlign(RegTensor<T> &dstReg, UnalignReg &ureg, LocalMem T *&srcUbAddr, AddrReg &areg,
    uint32_t inc)
{
    vldu(dstReg, ureg, areg, srcUbAddr, inc);
}
 
// vstus/vstas
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlign(LocalMem T *&dstUbAddr, RegTensor<T> &srcReg, UnalignReg &ureg, uint32_t offset)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vstus(ureg, offset, srcReg, dstUbAddr, postValue);
}
 
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlignPost(LocalMem T *&dstUbAddr, UnalignReg &ureg, int32_t offset)
{
    if constexpr (postMode == PostLiteral::POST_MODE_UPDATE) {
        vstas(ureg, dstUbAddr, offset, POST_UPDATE);
    } else {
        vstas(ureg, dstUbAddr, offset);
    }
}
 
// vstu/vsta
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlign(LocalMem T *&dstUbAddr, RegTensor<T> &srcReg, UnalignReg &ureg, AddrReg &areg)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vstu(ureg, areg, srcReg, dstUbAddr, postValue);
}
 
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlignPost(LocalMem T *&dstUbAddr, UnalignReg &ureg, AddrReg &areg)
{
    vsta(ureg, dstUbAddr, areg);
}
 
// vstur/vstar
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlign(LocalMem T *&dstUbAddr, RegTensor<T> &srcReg, UnalignReg &ureg)
{
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    vstur(ureg, srcReg, dstUbAddr, postValue);
}
 
template <typename T, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE>
__aicore__ inline void DataCopyUnAlignPost(LocalMem T *&dstUbAddr, UnalignReg &ureg)
{
    vstar(ureg, dstUbAddr);
}
 
// vgather2
template <typename T, typename U, typename S>
__aicore__ inline void DataCopyGather(RegTensor<T> &dstReg, LocalMem U *baseAddr, RegTensor<S> &index,
    MaskReg &mask)
{
    static_assert((sizeof(U) == 1 && sizeof(T) == 2 && std::is_same_v<S, uint16_t>) ||
        (sizeof(U) == 2 && sizeof(T) == 2 && std::is_same_v<S, uint16_t>) ||
        (sizeof(U) == 4 && sizeof(T) == 4 && std::is_same_v<S, uint32_t>),
        "DataCopyGather only support src data type b8/b16/b32 with dst type is b16/b16/b32 respectively and each index "
        "type is u16/u16/u32 respectively");
    if constexpr (sizeof(U) == 1 && sizeof(T) == 2) {
        vgather2((vector_s16 &)dstReg, (__ubuf__ int8_t *)baseAddr, index, mask);
    } else if constexpr (sizeof(U) == 2 && sizeof(T) == 2) {
        vgather2((vector_s16 &)dstReg, (__ubuf__ int16_t *)baseAddr, index, mask);
    } else {
        vgather2((vector_s32 &)dstReg, (__ubuf__ int32_t *)baseAddr, index, mask);
    }
}
 
// vgatherb
// mask will be treated as PAT_ALL in this l311 api.
template <typename T>
__aicore__ inline void DataCopyGatherB(RegTensor<T> &dstReg, LocalMem T *baseAddr, RegTensor<uint32_t> &index,
    MaskReg &mask)
{
    vgatherb(dstReg, baseAddr, index);
}
 
// vscatter
template <typename T, typename U>
__aicore__ inline void DataCopyScatter(LocalMem T *baseAddr, RegTensor<T> &srcReg, RegTensor<U> &index,
    MaskReg &mask)
{
    vscatter(srcReg, baseAddr, index, mask);
}
 
// pld/pst
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(MaskReg &dstReg, LocalMem T *srcUbAddr, AddrReg offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    pld(dstReg, srcUbAddr, offset, distValue);
}
 
template <typename T, Dist dist = Dist::DIST_NORM>
__aicore__ inline void DataCopy(LocalMem T *dstUbAddr, MaskReg &dstReg, AddrReg offset)
{
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    pst(dstReg, dstUbAddr, offset, distValue);
}
 
template <typename T> __aicore__ inline MaskReg CreatePredicate(uint32_t &scalar)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreatePredicate only support type b8/b16/b32");
 
    MaskReg reg;
    if constexpr (sizeof(T) == 1) {
        reg = plt_b8(scalar, POST_UPDATE);
    } else if constexpr (sizeof(T) == 2) {
        reg = plt_b16(scalar, POST_UPDATE);
    } else if constexpr (sizeof(T) == 4) {
        reg = plt_b32(scalar, POST_UPDATE);
    }
    return reg;
}
 
template <typename T, Pat mode = Pat::ALL> __aicore__ inline MaskReg CreatePredicate()
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4, "CreatePredicate only support type b8/b16/b32");
 
    constexpr auto modeValue = std::integral_constant<::Pat, static_cast<::Pat>(mode)>();
    MaskReg reg;
    if constexpr (sizeof(T) == 1) {
        reg = pset_b8(modeValue);
    } else if constexpr (sizeof(T) == 2) {
        reg = pset_b16(modeValue);
    } else if constexpr (sizeof(T) == 4) {
        reg = pset_b32(modeValue);
    }
    return reg;
}
 
template <HiloPart part = HiloPart::Lower> __aicore__ inline void PredicatePack(MaskReg &dstMask, MaskReg &srcMask)
{
    constexpr auto partValue = std::integral_constant<::HiloPart, static_cast<::HiloPart>(part)>();
    ppack(dstMask, srcMask, partValue);
}
 
template <HiloPart part = HiloPart::Lower> __aicore__ inline void PredicateUnPack(MaskReg &dstMask, MaskReg &srcMask)
{
    constexpr auto partValue = std::integral_constant<::HiloPart, static_cast<::HiloPart>(part)>();
    punpack(dstMask, srcMask, partValue);
}
 
template <typename T>
__aicore__ inline void PredicateInterleave(MaskReg &dstMask0, MaskReg &dstMask1, MaskReg &srcMask0, MaskReg &srcMask1)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
        "PredicateInterleave only support type b8/b16/b32 ");
    if constexpr (sizeof(T) == 1) {
        pintlv_b8(dstMask0, dstMask1, srcMask0, srcMask1);
    } else if constexpr (sizeof(T) == 2) {
        pintlv_b16(dstMask0, dstMask1, srcMask0, srcMask1);
    } else if constexpr (sizeof(T) == 4) {
        pintlv_b32(dstMask0, dstMask1, srcMask0, srcMask1);
    }
}
 
template <typename T>
__aicore__ inline void PredicateDeInterleave(MaskReg &dstMask0, MaskReg &dstMask1, MaskReg &srcMask0, MaskReg &srcMask1)
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
        "PredicateDeInterleave only support type b8/b16/b32 ");
    if constexpr (sizeof(T) == 1) {
        pdintlv_b8(dstMask0, dstMask1, srcMask0, srcMask1);
    } else if constexpr (sizeof(T) == 2) {
        pdintlv_b16(dstMask0, dstMask1, srcMask0, srcMask1);
    } else if constexpr (sizeof(T) == 4) {
        pdintlv_b32(dstMask0, dstMask1, srcMask0, srcMask1);
    }
}
 
template <typename T>
__aicore__ inline MaskReg MovePredicate()
{
    static_assert(SupportBytes<T, 2, 4>(), "MovePredicate only support type b16/b32");
    if constexpr (sizeof(T) == 2) {
        return movp_b16();
    } else {
        return movp_b32();
    }
}
template <Mode mode> __aicore__ inline constexpr auto GetMaskMergeMode()
{
// To avoid naming conflicts of mode struct in cpu debug.
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    return std::integral_constant<::CpuMode, static_cast<::CpuMode>(mode)>();
#else
    return std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
#endif
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Add(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vadd(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Sub(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vsub(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Mul(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmul(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Div(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vdiv(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Max(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmax(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Min(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmin(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void ShiftLeft(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<U> &srcReg1,
    MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vshl(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void ShiftRight(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<U> &srcReg1,
    MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vshr(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void And(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vand(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Or(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vor(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Xor(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vxor(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Round(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<U> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vrnd(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Prelu(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vprelu(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T>
__aicore__ inline void Mod(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<Mode::ZEROING>();;
    vdiv(dstReg, srcReg0, srcReg1, mask, modeValue);
    vmul(dstReg, srcReg1, dstReg, mask, modeValue);
    vsub(dstReg, srcReg0, dstReg, mask, modeValue);
}
 
template <typename T>
__aicore__ inline void Mull(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
                            MaskReg &mask)
{
    vmull(dstReg0, dstReg1, srcReg0, srcReg1, mask);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Mula(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmula(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T>
__aicore__ inline void AddCarryOut(MaskReg &carryp, RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
                                   MaskReg &mask)
{
    vaddc(carryp, dstReg, srcReg0, srcReg1, mask);
}
 
template <typename T>
__aicore__ inline void SubCarryOut(MaskReg &carryp, RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
                                   MaskReg &mask)
{
    vsubc(carryp, dstReg, srcReg0, srcReg1, mask);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void SaturationAdd(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vsadd(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void SaturationSub(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vssub(dstReg, srcReg0, srcReg1, mask, modeValue);
}
 
template <typename T, Rnd rnd, Mode mode = Mode::ZEROING>
__aicore__ inline void Average(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vavg(dstReg, srcReg0, srcReg1, rnd, mask, modeValue);
}
 
template <typename T>
__aicore__ inline void Addcs(MaskReg &carryp, RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
                             MaskReg &carrysrcp, MaskReg &mask)
{
    vaddcs(carryp, dstReg, srcReg0, srcReg1, carrysrcp, mask);
}
 
template <typename T>
__aicore__ inline void Subcs(MaskReg &carryp, RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
                             MaskReg &carrysrcp, MaskReg &mask)
{
    vsubcs(carryp, dstReg, srcReg0, srcReg1, carrysrcp, mask);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Adds(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vadds(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Muls(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmuls(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Maxs(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmaxs(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Mins(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vmins(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void ShiftLefts(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vshls(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void ShiftRights(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vshrs(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void Rounds(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vrnds(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void LeakyRelu(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, T scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vlrelu(dstReg, srcReg0, scalar, mask, modeValue);
}
 
template <typename T, CMPMODE mode = CMPMODE::EQ>
__aicore__ inline void Compare(MaskReg &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    if constexpr (mode == CMPMODE::EQ) {
        vcmp_eq(dstReg, srcReg0, srcReg1, mask);
    } else if constexpr (mode == CMPMODE::NE) {
        vcmp_ne(dstReg, srcReg0, srcReg1, mask);
    } else if constexpr (mode == CMPMODE::GT) {
        vcmp_gt(dstReg, srcReg0, srcReg1, mask);
    } else if constexpr (mode == CMPMODE::GE) {
        vcmp_ge(dstReg, srcReg0, srcReg1, mask);
    } else if constexpr (mode == CMPMODE::LT) {
        vcmp_lt(dstReg, srcReg0, srcReg1, mask);
    } else if constexpr (mode == CMPMODE::LE) {
        vcmp_le(dstReg, srcReg0, srcReg1, mask);
    }
}
 
template <typename T, CMPMODE mode = CMPMODE::EQ>
__aicore__ inline void CompareScalar(MaskReg &dstReg, RegTensor<T> &srcReg0, T scalar, MaskReg &mask)
{
    if constexpr (mode == CMPMODE::EQ) {
        vcmps_eq(dstReg, srcReg0, scalar, mask);
    } else if constexpr (mode == CMPMODE::NE) {
        vcmps_ne(dstReg, srcReg0, scalar, mask);
    } else if constexpr (mode == CMPMODE::GT) {
        vcmps_gt(dstReg, srcReg0, scalar, mask);
    } else if constexpr (mode == CMPMODE::GE) {
        vcmps_ge(dstReg, srcReg0, scalar, mask);
    } else if constexpr (mode == CMPMODE::LT) {
        vcmps_lt(dstReg, srcReg0, scalar, mask);
    } else if constexpr (mode == CMPMODE::LE) {
        vcmps_le(dstReg, srcReg0, scalar, mask);
    }
}
 
template <typename T>
__aicore__ inline void Select(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1, MaskReg &mask)
{
    vsel(dstReg, srcReg0, srcReg1, mask);
}
 
template <typename T>
__aicore__ inline void Slide(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1,
    uint16_t slideAmount)
{
    vslide(dstReg, srcReg0, srcReg1, slideAmount);
}
 
template <typename T>
__aicore__ inline void Selr(RegTensor<T> &dstReg, RegTensor<T> &srcReg0, RegTensor<T> &srcReg1)
{
    vselr(dstReg, srcReg0, srcReg1);
}
 
template <typename T, Order order = Order::INC_ORDER_VALUE, typename U>
__aicore__ inline void CreateVecIndex(RegTensor<T> &dstReg, U scalar)
{
    constexpr auto orderMode = std::integral_constant<::Order, static_cast<::Order>(order)>();
    vci(dstReg, (T)scalar, orderMode);
}
 
template <typename T, typename U>
__aicore__ inline void CreateVecIndexWithPattern(RegTensor<T> &dstReg, U scalar)
{
    vcp(dstReg, (T)scalar);
}
 
template <typename T, typename U, Mode mode = Mode::ZEROING>
__aicore__ inline void ReduceSum(RegTensor<T> &dstReg, RegTensor<U> srcReg0, MaskReg mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcadd(dstReg, srcReg0, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void ReduceMax(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcmax(dstReg, srcReg0, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void ReduceMin(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcmin(dstReg, srcReg0, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void ReduceMax(RegTensor<T> &dstReg, MaskReg &dstReg1, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcbmax(dstReg, dstReg1, srcReg0, pReg, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void ReduceMin(RegTensor<T> &dstReg, MaskReg &dstReg1, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcbmin(dstReg, dstReg1, srcReg0, pReg, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void BlockReduceSum(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcgadd(dstReg, srcReg0, pReg, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void BlockReduceMax(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcgmax(dstReg, srcReg0, pReg, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void BlockReduceMin(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcgmin(dstReg, srcReg0, pReg, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void PairReduceSum(RegTensor<T> &dstReg, RegTensor<T> srcReg0, MaskReg pReg)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vcpadd(dstReg, srcReg0, pReg, modeValue);
}
 
template<typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Axpy(RegTensor<T> &dstReg, RegTensor<T> srcReg, const T scalar, const MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vaxpy(dstReg, srcReg, scalar, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Abs(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vabs(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Relu(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vrelu(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Exp(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vexp(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Sqrt(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vsqrt(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Rsqrt(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vrsqrt(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Rec(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vrec(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Ln(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vln(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Neg(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vneg(dstReg, srcReg, mask, modeValue);
}
 
template <typename T, Mode mode = Mode::ZEROING>
__aicore__ inline void Not(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vnot(dstReg, srcReg, mask, modeValue);
}
 
template<RoundMode rndMode>
struct RndConverter {
    static constexpr decltype(auto) value = ROUND_H;
};
 
template<>
struct RndConverter<RoundMode::CAST_RINT> {
    static constexpr decltype(auto) value = ROUND_R;
};
 
template<>
struct RndConverter<RoundMode::CAST_ROUND> {
    static constexpr decltype(auto) value = ROUND_A;
};
 
template<>
struct RndConverter<RoundMode::CAST_FLOOR> {
    static constexpr decltype(auto) value = ROUND_F;
};
 
template<>
struct RndConverter<RoundMode::CAST_CEIL> {
    static constexpr decltype(auto) value = ROUND_C;
};
 
template<>
struct RndConverter<RoundMode::CAST_TRUNC> {
    static constexpr decltype(auto) value = ROUND_Z;
};
 
template<>
struct RndConverter<RoundMode::CAST_ODD> {
    static constexpr decltype(auto) value = ROUND_O;
};
 
template <typename T, typename U, RoundMode roundMode, Mode mode, SatMode satMode, PartMode partMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) sat = std::integral_constant<::RoundingSaturation, static_cast<::RoundingSaturation>(satMode)>();
    constexpr decltype(auto) part = std::integral_constant<::Part, static_cast<::Part>(partMode)>();
    vcvt(dstReg, srcReg, mask, rnd, sat, part, md);
}
 
template <typename T, typename U, RoundMode roundMode, Mode mode, SatMode satMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) sat = std::integral_constant<::RoundingSaturation, static_cast<::RoundingSaturation>(satMode)>();
    vcvt(dstReg, srcReg, mask, rnd, sat, md);
}
 
template <typename T, typename U, RoundMode roundMode, Mode mode, SatMode satMode, PPMode ppMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) sat = std::integral_constant<::RoundingSaturation, static_cast<::RoundingSaturation>(satMode)>();
    constexpr decltype(auto) pp = std::integral_constant<::Part_T, static_cast<::Part_T>(ppMode)>();
    vcvt(dstReg, srcReg, mask, rnd, sat, pp, md);
}
 
template <typename T, typename U, RoundMode roundMode, Mode mode, PartMode partMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) part = std::integral_constant<::Part, static_cast<::Part>(partMode)>();
    vcvt(dstReg, srcReg, mask, rnd, part, md);
}
 
template <typename T, typename U, Mode mode, SatMode satMode, PartMode partMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) sat = std::integral_constant<::RoundingSaturation, static_cast<::RoundingSaturation>(satMode)>();
    constexpr decltype(auto) part = std::integral_constant<::Part, static_cast<::Part>(partMode)>();
    vcvt(dstReg, srcReg, mask, sat, part, md);
}
 
template <typename T, typename U, Mode mode, SatMode satMode, PPMode ppMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) sat = std::integral_constant<::RoundingSaturation, static_cast<::RoundingSaturation>(satMode)>();
    constexpr decltype(auto) pp = std::integral_constant<::Part_T, static_cast<::Part_T>(ppMode)>();
    vcvt(dstReg, srcReg, mask, sat, pp, md);
}
 
template <typename T, typename U, PartMode partMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg) {
    constexpr decltype(auto) part = std::integral_constant<::Part, static_cast<::Part>(partMode)>();
    vcvt(dstReg, srcReg, part);
}
 
template <typename T, typename U, Mode mode, PartMode partMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) part = std::integral_constant<::Part, static_cast<::Part>(partMode)>();
    vcvt(dstReg, srcReg, mask, part, md);
}
 
template <typename T, typename U, PPMode ppMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg) {
    constexpr decltype(auto) pp = std::integral_constant<::Part_T, static_cast<::Part_T>(ppMode)>();
    vcvt(dstReg, srcReg, pp);
}
 
template <typename T, typename U, Mode mode, PPMode ppMode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    constexpr decltype(auto) pp = std::integral_constant<::Part_T, static_cast<::Part_T>(ppMode)>();
    if constexpr (IsSameType<T, half>::value && IsSameType<U, int8_t>::value) {
        vcvt_s42f16(dstReg, srcReg, mask, pp, md);
    } else {
        vcvt(dstReg, srcReg, mask, pp, md);
    }
}
 
template <typename T, typename U, RoundMode roundMode, Mode mode>
__aicore__ inline void Cast(RegTensor<T> &dstReg, RegTensor<U> &srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    vcvt(dstReg, srcReg, mask, rnd, md);
}
 
template<typename T, RoundMode roundMode, Mode mode = Mode::ZEROING>
__aicore__ inline void Truncate(RegTensor<T>& dstReg, RegTensor<T>& srcReg, MaskReg &mask) {
    constexpr decltype(auto) rnd = RndConverter<roundMode>::value;
    constexpr decltype(auto) md = std::integral_constant<::Mode, static_cast<::Mode>(mode)>();
    vtrc(dstReg, srcReg, rnd, mask, md);
}
 
template <typename T, typename U> __aicore__ inline void DuplicateImpl(RegTensor<T> &dstReg, U scalar)
{
    vbr(dstReg, (T)scalar);
}
 
 
template <typename T, typename U> __aicore__ inline void Duplicate(RegTensor<T> &dstReg, U scalar)
{
    vbr(dstReg, (T)scalar);
}
 
template <typename T, Mode mode = Mode::ZEROING, typename U>
__aicore__ inline void Duplicate(RegTensor<T> &dstReg, U scalar, MaskReg &mask)
{
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vdup(dstReg, (T)scalar, mask, modeValue);
}
 
template <typename T, Pos pos = Pos::LOWEST, Mode mode = Mode::ZEROING>
__aicore__ inline void Duplicate(RegTensor<T> &dstReg, RegTensor<T> &srcReg, MaskReg &mask)
{
    constexpr auto posValue = std::integral_constant<::Pos, static_cast<::Pos>(pos)>();
    constexpr auto modeValue = GetMaskMergeMode<mode>();;
    vdup(dstReg, srcReg, mask, posValue, modeValue);
}
 
template <typename T>
__aicore__ inline void Interleave(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, RegTensor<T> &srcReg0,
    RegTensor<T> &srcReg1)
{
    vintlv(dstReg0, dstReg1, srcReg0, srcReg1);
}
 
template <typename T>
__aicore__ inline void DeInterleave(RegTensor<T> &dstReg0, RegTensor<T> &dstReg1, RegTensor<T> &srcReg0,
    RegTensor<T> &srcReg1)
{
    vdintlv(dstReg0, dstReg1, srcReg0, srcReg1);
}

} // namespace AscendC
#endif // ASCENDC_MODULE_OPERATOR_COMMON_IMPL_H
